This project analyzes the newseeds.csv dataset, which
contains measurements of geometrical properties of kernels from three
wheat varieties. Using R, I explored the data through thorough
statistical analysis and clustering techniques. Key tasks include exploratory data analysis, ANOVA across seed types, and k-means clustering with internal and stability validation.
This project highlights robust data analysis, clustering techniques, and clean, efficient R code.
# Load the seeds dataset: each row is one kernel; V1-V7 are numeric
# geometry measurements and Type is the wheat variety label (A/B/C)
df <- read.csv("newseeds.csv")
# Preview the first six rows
head(df)
## V1 V2 V3 V4 V5 V6 V7 Type
## 1 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 A
## 2 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 A
## 3 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 A
## 4 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 A
## 5 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 A
## 6 14.38 14.21 0.8951 5.386 3.312 2.462 4.956 A
# Get the dimensions of the dataset
dataset_dimensions <- dim(df)
# Print the dimensions
cat("Number of rows:", dataset_dimensions[1], "\n")
## Number of rows: 206
cat("Number of columns:", dataset_dimensions[2], "\n")
## Number of columns: 8
# Column-wise summary (min/quartiles/mean/max); Type is still a
# character column at this point, so only its length/class are shown
summary(df)
## V1 V2 V3 V4
## Min. :10.59 Min. :12.41 Min. :0.8081 Min. :4.899
## 1st Qu.:12.23 1st Qu.:13.44 1st Qu.:0.8565 1st Qu.:5.253
## Median :14.38 Median :14.32 Median :0.8734 Median :5.524
## Mean :14.85 Mean :14.56 Mean :0.8709 Mean :5.628
## 3rd Qu.:17.35 3rd Qu.:15.75 3rd Qu.:0.8878 3rd Qu.:5.980
## Max. :21.18 Max. :17.25 Max. :0.9183 Max. :6.675
## V5 V6 V7 Type
## Min. :2.630 Min. :0.7651 Min. :4.519 Length:206
## 1st Qu.:2.937 1st Qu.:2.5162 1st Qu.:5.045 Class :character
## Median :3.244 Median :3.5990 Median :5.226 Mode :character
## Mean :3.259 Mean :3.7073 Mean :5.411
## 3rd Qu.:3.563 3rd Qu.:4.8120 3rd Qu.:5.877
## Max. :4.033 Max. :8.4560 Max. :6.550
# Total number of missing values across the entire dataset
sum(is.na(df))
## [1] 0
# Check for missing values per column. vapply is used instead of
# sapply: it guarantees a type-stable integer vector regardless of
# input, whereas sapply's return type depends on what it receives.
missing_values_summary <- vapply(df, function(x) sum(is.na(x)), integer(1L))
# Print the summary of missing values
print(missing_values_summary)
## V1 V2 V3 V4 V5 V6 V7 Type
## 0 0 0 0 0 0 0 0
The dataset has no missing values.
# Count the number of seeds by type (frequency table of the label)
type_counts <- table(df$Type)
# Draw the bar chart of class frequencies
barplot(type_counts, main="Bar Chart of Seed Types",
xlab="Type", ylab="Frequency", col='lightblue', border='black')
# Display the counts themselves
type_counts
##
## A B C
## 67 69 70
Seed types A, B, and C have similar counts.
# Load necessary library for plotting (ggplot2 is used in later
# chunks; the histograms below use base graphics)
library(ggplot2)
# Select the numeric columns by name. The previous form,
# df[, -which(names(df) == "Type")], silently selects ZERO columns
# when "Type" is absent because which() then returns integer(0);
# a logical mask degrades gracefully instead.
numeric_columns <- df[, names(df) != "Type"]
# Set up a 2 x 4 plotting grid, remembering the old settings so they
# can be restored afterwards (par() changes are a global side effect)
old_par <- par(mfrow = c(2, 4))
# Plot a histogram of each numeric variable
for (col_name in names(numeric_columns)) {
  hist(numeric_columns[[col_name]], main = paste("Histogram of", col_name),
       xlab = col_name, col = 'lightblue', border = 'black')
}
# Restore the previous graphics settings so later base plots are unaffected
par(old_par)
Based on the observation:
The distribution of V3 seems to be left-skewed.
The distribution of V1, V2,V4 and V7 seems to be right-skewed.
The distribution of V5 and V6 seems to be normally distributed.
# install.packages("GGally")
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
# Ensure 'Type' is treated as a factor for coloring
df$Type <- as.factor(df$Type)
# Create the pair plot: pairwise scatterplots, densities and
# correlations of the seven numeric variables, colored by seed type
pair_plot <- ggpairs(df,
columns = 1:7, # Adjust according to the number of numerical columns
aes(color = Type, alpha = 0.7),
title = "Pair Plot of Numerical Variables by Seed Type")
# Print the pair plot
print(pair_plot)
There are many pairs of variables that are linearly correlated.
Example: V1 seems to be highly correlated with V2, V3, V4, and V7.
V2 seems to be highly correlated with V1, V4, V5, V7, and so on.
# Pairwise Pearson correlations between the numeric columns. Columns
# are selected by name: -which(names(df) == "Type") would silently
# select zero columns if "Type" were ever absent (which() returns
# integer(0) in that case), whereas a logical mask degrades gracefully.
correlation_matrix <- cor(df[, names(df) != "Type"], use = "complete.obs")
# Print the correlation matrix
print(correlation_matrix)
## V1 V2 V3 V4 V5 V6
## V1 1.0000000 0.9944304 0.6143748 0.9504868 0.9709062 -0.23223141
## V2 0.9944304 1.0000000 0.5364040 0.9725679 0.9452655 -0.22014963
## V3 0.6143748 0.5364040 1.0000000 0.3759919 0.7662568 -0.33243778
## V4 0.9504868 0.9725679 0.3759919 1.0000000 0.8613990 -0.17438286
## V5 0.9709062 0.9452655 0.7662568 0.8613990 1.0000000 -0.26033211
## V6 -0.2322314 -0.2201496 -0.3324378 -0.1743829 -0.2603321 1.00000000
## V7 0.8654484 0.8922634 0.2360760 0.9348281 0.7511749 -0.01258409
## V7
## V1 0.86544836
## V2 0.89226342
## V3 0.23607598
## V4 0.93482807
## V5 0.75117490
## V6 -0.01258409
## V7 1.00000000
# Optional: Visualize the correlation matrix using a heatmap
library(heatmaply)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
## Loading required package: viridisLite
##
## ======================
## Welcome to heatmaply version 1.5.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
# Render the correlation matrix as an interactive heatmap
heatmaply(correlation_matrix, main = "Correlation Heatmap")
# Print the counts for each type
print(type_counts)
##
## A B C
## 67 69 70
# Extract counts for specific types if needed (a table can be indexed
# by level name)
type_A_count <- type_counts["A"]
type_B_count <- type_counts["B"]
type_C_count <- type_counts["C"]
# Print individual counts
cat("Number of Type A seeds:", type_A_count, "\n")
## Number of Type A seeds: 67
cat("Number of Type B seeds:", type_B_count, "\n")
## Number of Type B seeds: 69
cat("Number of Type C seeds:", type_C_count, "\n")
## Number of Type C seeds: 70
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Per-type summary statistics: mean, sd and median of each numeric
# variable; output columns are named like "V1_mean" via .names
summary_stats <- df %>%
group_by(Type) %>%
summarise(across(V1:V7, list(mean = mean, sd = sd, median = median), .names = "{col}_{fn}"))
# Print summary statistics
print(summary_stats)
## # A tibble: 3 × 22
## Type V1_mean V1_sd V1_median V2_mean V2_sd V2_median V3_mean V3_sd V3_median
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 14.4 1.24 14.4 14.3 0.586 14.4 0.880 0.0163 0.880
## 2 B 18.4 1.44 18.7 16.1 0.619 16.2 0.884 0.0154 0.883
## 3 C 11.9 0.723 11.8 13.2 0.340 13.2 0.849 0.0218 0.849
## # ℹ 12 more variables: V4_mean <dbl>, V4_sd <dbl>, V4_median <dbl>,
## # V5_mean <dbl>, V5_sd <dbl>, V5_median <dbl>, V6_mean <dbl>, V6_sd <dbl>,
## # V6_median <dbl>, V7_mean <dbl>, V7_sd <dbl>, V7_median <dbl>
The summary statistics differ between the seed types for each variable.
# Build one boxplot per numeric variable (V1-V7), stored in a named
# list so individual plots can be pulled out by variable name
vars <- names(df)[1:7]
plot_list <- setNames(lapply(vars, function(var) {
  ggplot(df, aes(x = Type, y = .data[[var]], fill = Type)) +
    geom_boxplot() +
    labs(title = paste("Boxplot of", var, "by Type"), x = "Type", y = var) +
    theme_minimal()
}), vars)
# Display plots
print(plot_list$V1)
print(plot_list$V2)
print(plot_list$V3)
print(plot_list$V4)
print(plot_list$V5)
print(plot_list$V6)
print(plot_list$V7)
We can see that V3 and V6 have different distributions compared with other variables.
I will perform ANOVA test to compare the mean between each type for each variable
For each variable (V1 to V7) and the seed types (A, B, C), the hypotheses can be expressed as:
H0:μA=μB=μC or All seed types’ means are equal (no significant differences among the means).
Where μA, μB, and μC are the population means of the seed types A, B, and C, respectively.
H1: At least one pair of group means is different (there is a significant difference among the means).
# List of variable names
variables <- names(df)[1:7]
# Fit a one-way ANOVA (variable ~ Type) for each numeric variable and
# keep the fitted aov objects in a list
anova_results <- lapply(variables, function(var) {
  formula <- as.formula(paste(var, "~ Type"))
  aov(formula, data = df)
})
# Summarize the results
summaries <- lapply(anova_results, summary)
# Print the summaries. seq_along() is used instead of
# 1:length(variables), which would misbehave (yield c(1, 0)) if the
# vector were ever empty.
for (i in seq_along(variables)) {
  cat("ANOVA for", variables[i], ":\n")
  print(summaries[[i]])
  cat("\n")
}
## ANOVA for V1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 2 1484 742.2 542 <2e-16 ***
## Residuals 203 278 1.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ANOVA for V2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 2 297.78 148.89 533.4 <2e-16 ***
## Residuals 203 56.66 0.28
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ANOVA for V3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 2 0.04955 0.024775 75.75 <2e-16 ***
## Residuals 203 0.06640 0.000327
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ANOVA for V4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 2 30.729 15.365 314.3 <2e-16 ***
## Residuals 203 9.922 0.049
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ANOVA for V5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 2 23.762 11.881 403.1 <2e-16 ***
## Residuals 203 5.983 0.029
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ANOVA for V6 :
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 2 158.0 79.00 51.28 <2e-16 ***
## Residuals 203 312.8 1.54
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ANOVA for V7 :
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 2 38.70 19.348 359.7 <2e-16 ***
## Residuals 203 10.92 0.054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation at the significance level α = 0.05:
For V1, p-value = <2e-16 which is smaller than 0.05,so we are 95% confident that we have enough evidence to reject the null hypothesis and conclude that there are statistically significant differences among the means of the three types/varieties of seeds for V1.
For V2, p-value = <2e-16 which is smaller than 0.05,so we are 95% confident that we have enough evidence to reject the null hypothesis and conclude that there are statistically significant differences among the means of the three types/varieties of seeds for V2.
For V3, p-value = <2e-16 which is smaller than 0.05,so we are 95% confident that we have enough evidence to reject the null hypothesis and conclude that there are statistically significant differences among the means of the three types/varieties of seeds for V3.
For V4, p-value = <2e-16 which is smaller than 0.05,so we are 95% confident that we have enough evidence to reject the null hypothesis and conclude that there are statistically significant differences among the means of the three types/varieties of seeds for V4.
For V5, p-value = <2e-16 which is smaller than 0.05,so we are 95% confident that we have enough evidence to reject the null hypothesis and conclude that there are statistically significant differences among the means of the three types/varieties of seeds for V5.
For V6, p-value = <2e-16 which is smaller than 0.05,so we are 95% confident that we have enough evidence to reject the null hypothesis and conclude that there are statistically significant differences among the means of the three types/varieties of seeds for V6.
For V7, p-value = <2e-16 which is smaller than 0.05,so we are 95% confident that we have enough evidence to reject the null hypothesis and conclude that there are statistically significant differences among the means of the three types/varieties of seeds for V7.
I decided to scale the data before running k-means because the variables (V1 to V7) have different scales (units).
K-means clustering is a distance-based algorithm, and variables with larger ranges can dominate the distance calculation. For this reason, scaling is crucial for K-means clustering to ensure that all variables contribute equally to the clustering process.
# Re-check the summary: Type is now a factor, so its per-level counts
# (A/B/C) are reported instead of the character-class placeholder
summary(df)
## V1 V2 V3 V4
## Min. :10.59 Min. :12.41 Min. :0.8081 Min. :4.899
## 1st Qu.:12.23 1st Qu.:13.44 1st Qu.:0.8565 1st Qu.:5.253
## Median :14.38 Median :14.32 Median :0.8734 Median :5.524
## Mean :14.85 Mean :14.56 Mean :0.8709 Mean :5.628
## 3rd Qu.:17.35 3rd Qu.:15.75 3rd Qu.:0.8878 3rd Qu.:5.980
## Max. :21.18 Max. :17.25 Max. :0.9183 Max. :6.675
## V5 V6 V7 Type
## Min. :2.630 Min. :0.7651 Min. :4.519 A:67
## 1st Qu.:2.937 1st Qu.:2.5162 1st Qu.:5.045 B:69
## Median :3.244 Median :3.5990 Median :5.226 C:70
## Mean :3.259 Mean :3.7073 Mean :5.411
## 3rd Qu.:3.563 3rd Qu.:4.8120 3rd Qu.:5.877
## Max. :4.033 Max. :8.4560 Max. :6.550
# Drop the class label before clustering. Selecting by name is safer
# than the positional df[, -8], which breaks silently if columns are
# ever added or reordered.
df_km <- df[, names(df) != "Type"]
# Standardize every variable (mean 0, sd 1) so that no variable
# dominates the Euclidean distances used by k-means
df.scaled <- scale(df_km)
head(df.scaled)
## V1 V2 V3 V4 V5 V6
## [1,] 0.139142273 0.212313721 0.002980093 0.30272355 0.1390616 -0.9808019
## [2,] 0.009536643 0.006977446 0.427663815 -0.16661241 0.1941918 -1.7746623
## [3,] -0.191693150 -0.358064821 1.432608464 -0.75721220 0.2046928 -0.6653694
## [4,] -0.345173501 -0.472140529 1.033153478 -0.68310653 0.3149532 -0.9557257
## [5,] 0.439281626 0.326389430 1.365331835 0.06693275 0.7953735 -1.5522758
## [6,] -0.160997080 -0.266804254 1.016334321 -0.54387768 0.1390616 -0.8217659
## V7
## [1,] -0.3879479
## [2,] -0.9245782
## [3,] -1.1908606
## [4,] -1.2315144
## [5,] -0.4794189
## [6,] -0.9245782
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
# Fix the RNG so the random k-means initialization is reproducible
set.seed(123)
# k = 3 clusters; nstart = 25 random starts, keeping the best solution
km.res <- kmeans(df.scaled, 3, nstart = 25)
#####Print the results
print(km.res)
## K-means clustering with 3 clusters of sizes 72, 68, 66
##
## Cluster means:
## V1 V2 V3 V4 V5 V6
## 1 -1.0215292 -0.9985788 -0.9534927 -0.8902656 -1.07500585 0.68306408
## 2 -0.1326082 -0.1613401 0.4555515 -0.2518802 0.01185297 -0.67464490
## 3 1.2510221 1.2555879 0.5708178 1.2307117 1.16052151 -0.05007213
## V7
## 1 -0.6283700
## 2 -0.5815018
## 3 1.2846176
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 1 3 3 3 3 3 3 3
## [75] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 2 3 3 2 3 2 2 3 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 142.6614 139.3420 136.1842
## (between_SS / total_SS = 70.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#### Accessing the results of the kmeans() function
##### Cluster number for each of the observations
#km.res$cluster
# Show only the first ten assignments to keep the output short
head(km.res$cluster, 10)
## [1] 2 2 2 2 2 2 2 2 3 2
##### Cluster size
km.res$size
## [1] 72 68 66
##### Cluster means (in standardized units, since df.scaled was clustered)
km.res$centers
## V1 V2 V3 V4 V5 V6
## 1 -1.0215292 -0.9985788 -0.9534927 -0.8902656 -1.07500585 0.68306408
## 2 -0.1326082 -0.1613401 0.4555515 -0.2518802 0.01185297 -0.67464490
## 3 1.2510221 1.2555879 0.5708178 1.2307117 1.16052151 -0.05007213
## V7
## 1 -0.6283700
## 2 -0.5815018
## 3 1.2846176
####Visualizing k-means clusters
If we have a multi-dimensional data set, a solution is to perform Principal Component Analysis (PCA) and to plot data points according to the first two principal components coordinates.
# Visualize the clusters; fviz_cluster projects the 7-dimensional
# scaled data onto the first two principal components
fviz_cluster(km.res, data = df.scaled,
palette = c("#2E9FDF", "#E7B800", "#FC4E07"),
ellipse.type = "euclid", # Concentration ellipse
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE, # Avoid label overplotting (slow)
ggtheme = theme_minimal()
)
# K-means clustering via eclust(), which also computes silhouette info
km.res1 <- eclust(df.scaled, "kmeans", k = 3, nstart = 25, graph = FALSE)
# Visualize k-means clusters
fviz_cluster(km.res1, geom = "point", ellipse.type = "norm",
palette = "jco", ggtheme = theme_minimal())
# Silhouette plot; also prints the average silhouette width per cluster
fviz_silhouette(km.res1, palette = "jco",
ggtheme = theme_classic())
## cluster size ave.sil.width
## 1 1 72 0.40
## 2 2 68 0.34
## 3 3 66 0.47
The silhouette coefficient (Si) measures how similar an object i is to
the other objects in its own cluster versus those in the neighboring
cluster. Si values range from −1 to 1:
# Silhouette information computed by eclust()
silinfo <- km.res1$silinfo
names(silinfo)
## [1] "widths" "clus.avg.widths" "avg.width"
# Silhouette widths of the first ten observations; the row labels are
# the original observation numbers, ordered by cluster and decreasing width
head(silinfo$widths[, 1:3], 10)
## cluster neighbor sil_width
## 173 1 2 0.5901204
## 170 1 2 0.5853422
## 187 1 2 0.5795698
## 174 1 2 0.5643366
## 190 1 2 0.5634643
## 175 1 2 0.5629583
## 152 1 2 0.5588940
## 159 1 2 0.5581227
## 169 1 2 0.5575083
## 165 1 2 0.5564944
# Keep cluster, neighbor and silhouette width for each observation
sil <- km.res1$silinfo$widths[, 1:3]
# Objects with a negative silhouette width (on average closer to the
# neighboring cluster than to their own)
neg_sil_index <- which(sil[, 'sil_width'] < 0)
sil[neg_sil_index, , drop = FALSE]
## cluster neighbor sil_width
## 25 2 1 -0.0003692162
## 26 2 1 -0.0038330091
# Count of observations with a negative silhouette width
negative_silhouette_count <- length(neg_sil_index)
negative_silhouette_count
## [1] 2
For the entire data, there are 2 negative silhouette scores from cluster 2, neighbor 1 with sil_width:-0.0003692162 and -0.0038330091.
# Total number of scored observations; nrow() is the idiomatic and
# clearer equivalent of dim(sil)[1]
nrow(sil)
## [1] 206
# Objects with a silhouette width below 0.1 (weakly assigned points;
# the cutoff here is 0.1, not zero). Row labels are the original
# observation numbers.
sil_index_0.1 <- which(sil[, 'sil_width'] < 0.1)
sil[sil_index_0.1, , drop = FALSE]
## cluster neighbor sil_width
## 202 1 2 0.0940317312
## 192 1 2 0.0578815361
## 57 1 2 0.0549802854
## 176 1 2 0.0536805280
## 61 1 2 0.0460482266
## 59 1 2 0.0091485047
## 10 2 3 0.0900802544
## 41 2 3 0.0859849738
## 22 2 1 0.0847007083
## 28 2 1 0.0668964638
## 194 2 1 0.0633440488
## 134 2 3 0.0465193468
## 38 2 1 0.0171125792
## 129 2 3 0.0084880391
## 25 2 1 -0.0003692162
## 26 2 1 -0.0038330091
## 36 3 2 0.0918121919
## 131 3 2 0.0632086176
## 97 3 2 0.0413410884
## 119 3 2 0.0237154838
# Number of observations below the 0.1 threshold
length(sil_index_0.1)
## [1] 20
# Percentage of observations with silhouette width below 0.1;
# nrow(sil) replaces dim(sil)[1] for clarity
percentage_less_0.1 <- (length(sil_index_0.1) / nrow(sil)) * 100
percentage_less_0.1
## [1] 9.708738
For the entire dataset, 20 silhouette scores have values less than 0.1.
That is, 9.708738% of the silhouette scores in the dataset are below 0.1.
# For each cluster, find the positions within `sil` whose silhouette
# width is below 0.1.
# NOTE(review): which() returns positions WITHIN `sil`, and eclust
# orders `sil` by cluster and decreasing width — so these are NOT row
# numbers of the original df (compare with the rownames shown in the
# sil_index_0.1 listing). Map through rownames(sil) before indexing df.
rows_less_than_0.1_by_cluster <- lapply(unique(sil$cluster), function(cluster_num) {
rows <- which(sil$cluster == cluster_num & sil$sil_width < 0.1)
return(rows)
})
# Print the rows with silhouette scores less than 0.1 for each cluster
names(rows_less_than_0.1_by_cluster) <- paste("Cluster", unique(sil$cluster))
rows_less_than_0.1_by_cluster
## $`Cluster 1`
## [1] 67 68 69 70 71 72
##
## $`Cluster 2`
## [1] 131 132 133 134 135 136 137 138 139 140
##
## $`Cluster 3`
## [1] 203 204 205 206
Mapping to the original dataset
# Identify rows in the original dataset (df) with silhouette < 0.1.
# BUG FIX: the indices in rows_less_than_0.1_by_cluster are positions
# within `sil` (which is ordered by cluster and silhouette width), not
# positions in df, so df[row_indices, ] picked arbitrary rows. The
# correct mapping goes through rownames(sil), which carries the
# original observation labels; df's default rownames ("1".."206")
# match them. The listing shown below in this report was produced by
# the old positional version and displays the wrong rows; re-run to
# regenerate it.
rows_in_df_less_than_0.1_by_cluster <- lapply(rows_less_than_0.1_by_cluster, function(row_indices) {
  df[rownames(sil)[row_indices], ]
})
# Print the rows of the original dataset with silhouette scores < 0.1 for each cluster
names(rows_in_df_less_than_0.1_by_cluster) <- paste("Cluster", unique(sil$cluster))
rows_in_df_less_than_0.1_by_cluster
## $`Cluster 1`
## V1 V2 V3 V4 V5 V6 V7 Type
## 67 12.73 13.75 0.8458 5.412 2.882 3.533 5.067 A
## 68 17.63 15.98 0.8673 6.191 3.561 4.076 6.060 B
## 69 16.84 15.67 0.8623 5.998 3.484 4.675 5.877 B
## 70 17.26 15.73 0.8763 5.978 3.594 4.539 5.791 B
## 71 19.11 16.26 0.9081 6.154 3.930 2.936 6.079 B
## 72 16.82 15.51 0.8786 6.017 3.486 4.004 5.841 B
##
## $`Cluster 2`
## V1 V2 V3 V4 V5 V6 V7 Type
## 131 15.56 14.89 0.8823 5.776 3.408 4.972 5.847 B
## 132 15.38 14.66 0.8990 5.477 3.465 3.600 5.439 B
## 133 17.36 15.76 0.8785 6.145 3.574 3.526 5.971 B
## 134 15.57 15.15 0.8527 5.920 3.231 2.640 5.879 B
## 135 15.60 15.11 0.8580 5.832 3.286 2.725 5.752 B
## 136 16.23 15.18 0.8850 5.872 3.472 3.769 5.922 B
## 137 13.07 13.92 0.8480 5.472 2.994 5.304 5.395 C
## 138 13.32 13.94 0.8613 5.541 3.073 7.035 5.440 C
## 139 13.34 13.95 0.8620 5.389 3.074 5.995 5.307 C
## 140 12.22 13.32 0.8652 5.224 2.967 5.469 5.221 C
##
## $`Cluster 3`
## V1 V2 V3 V4 V5 V6 V7 Type
## 203 11.23 12.88 0.8511 5.140 2.795 4.325 5.003 C
## 204 13.20 13.66 0.8883 5.236 3.232 8.315 5.056 C
## 205 11.84 13.21 0.8521 5.175 2.836 3.598 5.044 C
## 206 12.30 13.34 0.8684 5.243 2.974 5.637 5.063 C
# Average silhouette width of each cluster (as reported by eclust)
silinfo$clus.avg.widths
## [1] 0.4024117 0.3365185 0.4682397
# Cross-check: recompute the average silhouette score per cluster
# from the per-observation widths
average_silhouette_by_cluster <- aggregate(sil_width ~ cluster, data = sil, mean)
# Print the average silhouette score for each cluster (to 3 decimal places)
average_silhouette_by_cluster$sil_width <- round(average_silhouette_by_cluster$sil_width, 3)
average_silhouette_by_cluster
## cluster sil_width
## 1 1 0.402
## 2 2 0.337
## 3 3 0.468
The average silhouette score (exact to 3 decimal places) for cluster 1: 0.402. The average silhouette score (exact to 3 decimal places) for cluster 2: 0.337.
The average silhouette score (exact to 3 decimal places) for cluster 3: 0.468.
# The total average (mean of all individual silhouette widths)
silinfo$avg.width
## [1] 0.4017511
# Rounded to 3 decimal places for reporting
round(silinfo$avg.width,3)
## [1] 0.402
The average silhouette score (exact to 3 decimal places) for the entire dataset is 0.402.
# Elbow method: total within-cluster sum of squares against k; the
# dashed vertical line marks the chosen k = 3
fviz_nbclust(df.scaled, kmeans, iter.max = 10, nstart = 25, method = "wss") +
geom_vline(xintercept = 3, linetype = 2)+
labs(subtitle = "Elbow method")
# Average silhouette width against k
fviz_nbclust(df.scaled, kmeans, iter.max = 10, nstart = 25, method = "silhouette")+
labs(subtitle = "Silhouette method")
# nboot = 50 to keep the function speedy.
# recommended value: nboot= 500 for your analysis.
# Use verbose = FALSE to hide computing progression.
set.seed(123)
fviz_nbclust(df.scaled, kmeans,iter.max = 10, nstart = 25, method = "gap_stat", nboot = 50)+
labs(subtitle = "Gap statistic method")
- Elbow method: 3-cluster solution suggested
- Silhouette method: 2-cluster solution suggested
- Gap statistic method: 3-cluster solution suggested
According to these observations, it’s possible to define k = 3 as the optimal number of clusters in the data.
library(clValid)
# Compare three clustering algorithms over k = 2..6 using internal
# validation measures (Connectivity, Dunn index, Silhouette)
clmethods <- c("hierarchical","kmeans","pam")
intern <- clValid(df.scaled, nClust = 2:6,
clMethods = clmethods, validation = "internal")
## Warning in clValid(df.scaled, nClust = 2:6, clMethods = clmethods, validation =
## "internal"): rownames for data not specified, using 1:nrow(data)
# Summary; per the Optimal Scores table below, Connectivity is best
# when smallest, Dunn and Silhouette when largest
summary(intern)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## hierarchical Connectivity 17.2488 26.0091 37.9290 41.0579 49.6516
## Dunn 0.1164 0.1164 0.1059 0.1059 0.1158
## Silhouette 0.4411 0.3145 0.3737 0.2942 0.3043
## kmeans Connectivity 17.1071 40.0107 63.2012 77.1575 96.4766
## Dunn 0.0871 0.1192 0.0665 0.0815 0.0815
## Silhouette 0.4661 0.4018 0.3474 0.2889 0.2737
## pam Connectivity 28.7833 41.4198 56.7365 73.1250 82.9833
## Dunn 0.0590 0.1115 0.1115 0.0817 0.0652
## Silhouette 0.4650 0.3996 0.3325 0.2681 0.2563
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 17.1071 kmeans 2
## Dunn 0.1192 kmeans 3
## Silhouette 0.4661 kmeans 2
It can be seen that k-means clustering with two clusters performs best for the Connectivity and Silhouette measures.
However, based on the Dunn index, the appropriate number of clusters is 3.
# Define k values and the k-means internal-validation metrics
# reported by clValid above
k_values <- c(2, 3, 4, 5, 6)
connectivity <- c(17.1071, 40.0107, 63.2012, 77.1575, 96.4766)
dunn <- c(0.0871, 0.1192, 0.0665, 0.0815, 0.0815)
silhouette <- c(0.4661, 0.4018, 0.3474, 0.2889, 0.2737)
# Combine into a single long-format data frame (one row per metric/k)
df_metrics <- data.frame(
  k = rep(k_values, 3),
  Metric = rep(c("Connectivity", "Dunn Index", "Silhouette Score"), each = length(k_values)),
  Value = c(connectivity, dunn, silhouette)
)
# Helper: line-and-point plot of one metric against k. Extracted so
# the same ggplot recipe is not repeated three times.
plot_metric <- function(data, metric_name, y_label, title) {
  ggplot(subset(data, Metric == metric_name), aes(x = k, y = Value, color = Metric)) +
    geom_line() +
    geom_point() +
    labs(title = title,
         x = "Number of Clusters (k)",
         y = y_label) +
    theme_minimal()
}
# Plot Connectivity
p1 <- plot_metric(df_metrics, "Connectivity", "Connectivity",
                  "Connectivity for Different k Values")
# Plot Dunn Index
p2 <- plot_metric(df_metrics, "Dunn Index", "Dunn Index",
                  "Dunn Index for Different k Values")
# Plot Silhouette Scores
p3 <- plot_metric(df_metrics, "Silhouette Score", "Silhouette Score",
                  "Silhouette Scores for Different k Values")
# Display plots
print(p1)
print(p2)
print(p3)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Combine the plots into a grid
grid.arrange(p1, p2, p3, ncol = 1)
# Stability measures (APN, AD, ADM, FOM) for the same methods and
# k = 2..6; per the Optimal Scores table below, lower is better for
# all four
clmethods <- c("hierarchical","kmeans","pam")
stab <- clValid(df.scaled, nClust = 2:6, clMethods = clmethods,
validation = "stability")
## Warning in clValid(df.scaled, nClust = 2:6, clMethods = clmethods, validation =
## "stability"): rownames for data not specified, using 1:nrow(data)
# Display only optimal Scores
summary(stab)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## hierarchical APN 0.0877 0.1678 0.1550 0.1791 0.1444
## AD 2.4434 2.3917 1.9967 1.9740 1.8504
## ADM 0.3641 0.5787 0.4695 0.4990 0.4973
## FOM 0.6981 0.6659 0.5801 0.5724 0.5592
## kmeans APN 0.0383 0.0996 0.1692 0.2184 0.2938
## AD 2.3315 1.9422 1.8847 1.7736 1.7386
## ADM 0.1772 0.3141 0.4626 0.5381 0.6714
## FOM 0.6714 0.5703 0.5624 0.5268 0.5204
## pam APN 0.0425 0.0809 0.1737 0.1459 0.3166
## AD 2.3441 1.9274 1.8452 1.6996 1.7235
## ADM 0.1676 0.2378 0.4387 0.3275 0.6134
## FOM 0.6586 0.5628 0.5335 0.5077 0.5149
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.0383 kmeans 2
## AD 1.6996 pam 5
## ADM 0.1676 pam 2
## FOM 0.5077 pam 5
Using the k-means clustering technique:
For APN, k-means with two clusters gives the best (lowest) score.
For AD, k-means with six clusters gives the best (lowest) score.
For ADM, k-means with two clusters gives the best (lowest) score.
For FOM, k-means with six clusters gives the best (lowest) score.
# Define k values and the k-means stability metrics reported by
# clValid above
k_values <- c(2, 3, 4, 5, 6)
apn <- c(0.0383, 0.0996, 0.1692, 0.2184, 0.2938)
ad <- c(2.3315, 1.9422, 1.8847, 1.7736, 1.7386)
adm <- c(0.1772, 0.3141, 0.4626, 0.5381, 0.6714)
fom <- c(0.6714, 0.5703, 0.5624, 0.5268, 0.5204)
# Combine into a single long-format data frame (one row per metric/k)
df_metrics <- data.frame(
  k = rep(k_values, 4),
  Metric = rep(c("APN", "AD", "ADM", "FOM"), each = length(k_values)),
  Value = c(apn, ad, adm, fom)
)
# Helper: line-and-point plot of one metric against k. Extracted so
# the same ggplot recipe is not repeated four times; defined here so
# this chunk is self-contained.
plot_metric <- function(data, metric_name, y_label, title) {
  ggplot(subset(data, Metric == metric_name), aes(x = k, y = Value, color = Metric)) +
    geom_line() +
    geom_point() +
    labs(title = title,
         x = "Number of Clusters (k)",
         y = y_label) +
    theme_minimal()
}
# Plot APN
p1 <- plot_metric(df_metrics, "APN", "APN", "APN for Different k Values")
# Plot AD
p2 <- plot_metric(df_metrics, "AD", "AD", "AD for Different k Values")
# Plot ADM
p3 <- plot_metric(df_metrics, "ADM", "ADM", "ADM for Different k Values")
# Plot FOM
p4 <- plot_metric(df_metrics, "FOM", "FOM", "FOM for Different k Values")
# Display plots
print(p1)
print(p2)
print(p3)
print(p4)
library(gridExtra)
# Combine the plots into a grid
grid.arrange(p1, p2, p3, p4, ncol = 2)
Conclusion: Each technique may result in a different number of optimal clusters.